數據科學概論 Hw6 (含 Hw5 以前)

0416235 劉昱劭

Hw3 Hw4 Hw5 Hw6

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
In [3]:
protein = pd.read_csv("nuclear.csv")
In [4]:
# mean & stdev of each class
print('Mean(cm): \n', protein.groupby(['class']).mean())
print('-'*80,'\n')
print('Stdev(cm): \n', protein.groupby(['class']).std())
print('-'*80)
Mean(cm): 
         DYRK1A_N   ITSN1_N    BDNF_N     NR1_N    NR2A_N    pAKT_N   pBRAF_N  \
class                                                                          
c-CS-m  0.480456  0.652587  0.339217  2.381749  4.308540  0.229932  0.182211   
c-CS-s  0.596748  0.772395  0.342315  2.417809  4.280077  0.212423  0.168356   
c-SC-m  0.273203  0.436361  0.290946  2.145633  3.459416  0.241253  0.189547   
c-SC-s  0.274823  0.449354  0.313393  2.404974  3.913096  0.233368  0.184975   
t-CS-m  0.619294  0.797007  0.312732  2.196541  3.565960  0.213621  0.173956   
t-CS-s  0.525735  0.759556  0.305460  2.184606  3.514839  0.214466  0.164795   
t-SC-m  0.329861  0.566783  0.321063  2.379446  4.056223  0.269131  0.201007   
t-SC-s  0.337488  0.549056  0.325586  2.248742  3.565093  0.246759  0.185318   

        pCAMKII_N   pCREB_N    pELK_N    ...        SHH_N     BAD_N    BCL2_N  \
class                                    ...                                    
c-CS-m   2.916187  0.198484  1.492318    ...     0.226911  0.156882  0.132539   
c-CS-s   2.935576  0.208439  1.686844    ...     0.214818  0.144670  0.126590   
c-SC-m   4.736327  0.208149  1.278566    ...     0.224470  0.169294  0.155980   
c-SC-s   3.361288  0.214949  1.327714    ...     0.241853  0.156121  0.133082   
t-CS-m   3.121801  0.203395  1.563905    ...     0.216934  0.150572  0.132005   
t-CS-s   2.488902  0.210041  1.518302    ...     0.222144  0.150884  0.130186   
t-SC-m   4.277257  0.231789  1.381516    ...     0.230667  0.153697  0.137467   
t-SC-s   4.176555  0.227165  1.204840    ...     0.234828  0.174648  0.137082   

           pS6_N   pCFOS_N     SYP_N  H3AcK18_N    EGR1_N  H3MeK4_N    CaNA_N  
class                                                                          
c-CS-m  0.119782  0.123929  0.467403   0.142027  0.174599  0.178675  1.523659  
c-CS-s  0.112512  0.126954  0.445322   0.145542  0.166534  0.172007  1.617609  
c-SC-m  0.128108  0.143614  0.456874   0.185664  0.217339  0.220825  1.009957  
c-SC-s  0.132929  0.135007  0.471512   0.152480  0.200690  0.219120  1.157639  
t-CS-m  0.108196  0.127762  0.413597   0.149539  0.162359  0.180920  1.633341  
t-CS-s  0.111354  0.121971  0.433587   0.158388  0.158174  0.191430  1.552633  
t-SC-m  0.137287  0.130234  0.445738   0.226197  0.196089  0.247652  1.041134  
t-SC-s  0.119201  0.133663  0.428207   0.208931  0.184275  0.226498  1.229815  

[8 rows x 77 columns]
-------------------------------------------------------------------------------- 

Stdev(cm): 
         DYRK1A_N   ITSN1_N    BDNF_N     NR1_N    NR2A_N    pAKT_N   pBRAF_N  \
class                                                                          
c-CS-m  0.129722  0.153911  0.046883  0.320972  0.973575  0.024935  0.019315   
c-CS-s  0.529338  0.503299  0.054885  0.368772  1.001889  0.052908  0.033505   
c-SC-m  0.047923  0.068982  0.038587  0.286331  0.670048  0.037195  0.030927   
c-SC-s  0.046671  0.074120  0.043637  0.353018  0.872614  0.030627  0.023067   
t-CS-m  0.156636  0.187409  0.051175  0.379840  0.876532  0.027822  0.020831   
t-CS-s  0.139950  0.154217  0.043350  0.275957  0.636877  0.035932  0.018745   
t-SC-m  0.078622  0.077883  0.033216  0.258372  0.837798  0.038921  0.024047   
t-SC-s  0.074539  0.111466  0.057544  0.382037  1.011215  0.045216  0.023108   

        pCAMKII_N   pCREB_N    pELK_N    ...        SHH_N     BAD_N    BCL2_N  \
class                                    ...                                    
c-CS-m   0.926526  0.027469  0.302739    ...     0.025874  0.027351  0.029466   
c-CS-s   0.706755  0.038301  0.908641    ...     0.018474  0.020545  0.017551   
c-SC-m   0.847418  0.033489  0.263146    ...     0.032705  0.040022  0.033346   
c-SC-s   0.864917  0.029505  0.571705    ...     0.030560  0.022719  0.017994   
t-CS-m   1.505956  0.024670  0.287460    ...     0.021517  0.029802  0.030342   
t-CS-s   0.755425  0.028183  0.285628    ...     0.025197  0.030728  0.031194   
t-SC-m   1.018710  0.028202  0.164232    ...     0.035801  0.027166  0.025937   
t-SC-s   1.531858  0.034214  0.253260    ...     0.027322  0.022806  0.022206   

           pS6_N   pCFOS_N     SYP_N  H3AcK18_N    EGR1_N  H3MeK4_N    CaNA_N  
class                                                                          
c-CS-m  0.016945  0.018875  0.062128   0.025188  0.037033  0.044230  0.201078  
c-CS-s  0.009643  0.017445  0.077240   0.037531  0.023459  0.033198  0.269305  
c-SC-m  0.006325  0.032573  0.050533   0.052532  0.050054  0.052955  0.135937  
c-SC-s  0.006891  0.024200  0.061357   0.033508  0.039118  0.059739  0.165710  
t-CS-m  0.008116  0.020997  0.064204   0.036571  0.033867  0.038142  0.212176  
t-CS-s  0.010461  0.018399  0.080797   0.051629  0.022450  0.053142  0.237452  
t-SC-m  0.010215  0.023143  0.048419   0.110055  0.038481  0.065826  0.158194  
t-SC-s  0.010064  0.021271  0.065896   0.056441  0.031866  0.044017  0.188082  

[8 rows x 77 columns]
--------------------------------------------------------------------------------
In [5]:
proteinData = [protein.iloc[:,1:18], protein.iloc[:,11:21], protein.iloc[:,21:31], protein.iloc[:,31:41], 
               protein.iloc[:,41:51], protein.iloc[:,51:61], protein.iloc[:,61:71], protein.iloc[:,71:78]]
#print(proteinData)
In [6]:
proteinClass = protein.iloc[:, 78:82]
#print(proteinClass)

Deal with missing value (Hw3)

Top

  1. Drop missing value
  2. Fill missing value with mean
In [7]:
# 挑出要分析的 attribute,和 class
sub_protein = pd.concat([proteinData[0],proteinClass], axis=1)
sub_protein.describe(percentiles=[])
Out[7]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N
count 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000
mean 0.425810 0.617102 0.319088 2.297269 3.843934 0.233168 0.181846 3.537109 0.212574 1.428682 0.545904 0.313505 0.317939 0.275033 0.825813 0.726933 1.561965
std 0.249362 0.251640 0.049383 0.347293 0.933100 0.041634 0.027042 1.295169 0.032587 0.466904 0.345309 0.051978 0.052236 0.046164 0.117969 0.188013 0.270737
min 0.145327 0.245359 0.115181 1.330831 1.737540 0.063236 0.064043 1.343998 0.112812 0.429032 0.149155 0.052110 0.191431 0.056818 0.500160 0.281285 0.301609
50% 0.366378 0.565782 0.316564 2.296546 3.760855 0.231177 0.182302 3.326520 0.210594 1.355846 0.443644 0.321330 0.312977 0.277393 0.821076 0.719591 1.563696
max 2.516367 2.602662 0.497160 3.757641 8.482553 0.539050 0.317066 7.464070 0.306247 6.113347 3.566685 0.493426 0.473992 0.458001 1.408169 1.412750 2.723965
In [8]:
na_cols = sub_protein.columns[sub_protein.isna().any()].tolist()
print(na_cols)
['DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N', 'pAKT_N', 'pBRAF_N', 'pCAMKII_N', 'pCREB_N', 'pELK_N', 'pERK_N', 'pJNK_N', 'PKCA_N', 'pMEK_N', 'pNR1_N', 'pNR2A_N', 'pNR2B_N']
In [9]:
## drop 
proteinClean = sub_protein.dropna()
#print(proteinClean)
In [10]:
## fill missing value with mean of each group
fill_protein = sub_protein.copy()
for n in na_cols:
    fill_protein[n] = fill_protein.groupby(['class'], sort=False)[n].apply(lambda x: x.fillna(x.mean()))
    #protein[x] = protein.groupby("class").transform(lambda x: x.fillna(x.mean()))
    #for c in proteinClass['class'].unique():
    #print(c)
#proteinFill = protein.fillna(,inplace=True)
In [11]:
# 印出有 missing value 的 column

## 未處理 missing value 的原資料
na_cols = sub_protein.columns[sub_protein.isna().any()].tolist()
print(na_cols)

## fill_protein 已用平均值填補完,沒有 missing value,印出 empty list
na_cols = fill_protein.columns[fill_protein.isna().any()].tolist()
print(na_cols)
['DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N', 'pAKT_N', 'pBRAF_N', 'pCAMKII_N', 'pCREB_N', 'pELK_N', 'pERK_N', 'pJNK_N', 'PKCA_N', 'pMEK_N', 'pNR1_N', 'pNR2A_N', 'pNR2B_N']
[]
In [12]:
sub_protein.describe(percentiles=[], include='all')
Out[12]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N ... pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N Genotype Treatment Behavior class
count 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 ... 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1077.000000 1080 1080 1080 1080
unique NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 2 2 2 8
top NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN Control Memantine S/C c-CS-m
freq NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 570 570 555 150
mean 0.425810 0.617102 0.319088 2.297269 3.843934 0.233168 0.181846 3.537109 0.212574 1.428682 ... 0.313505 0.317939 0.275033 0.825813 0.726933 1.561965 NaN NaN NaN NaN
std 0.249362 0.251640 0.049383 0.347293 0.933100 0.041634 0.027042 1.295169 0.032587 0.466904 ... 0.051978 0.052236 0.046164 0.117969 0.188013 0.270737 NaN NaN NaN NaN
min 0.145327 0.245359 0.115181 1.330831 1.737540 0.063236 0.064043 1.343998 0.112812 0.429032 ... 0.052110 0.191431 0.056818 0.500160 0.281285 0.301609 NaN NaN NaN NaN
50% 0.366378 0.565782 0.316564 2.296546 3.760855 0.231177 0.182302 3.326520 0.210594 1.355846 ... 0.321330 0.312977 0.277393 0.821076 0.719591 1.563696 NaN NaN NaN NaN
max 2.516367 2.602662 0.497160 3.757641 8.482553 0.539050 0.317066 7.464070 0.306247 6.113347 ... 0.493426 0.473992 0.458001 1.408169 1.412750 2.723965 NaN NaN NaN NaN

9 rows × 21 columns

Visualization (Hw4)

Top

  1. 線圖 (line plot)
  2. 盒鬚圖 (box plot)
  3. 直方圖 (histogram)
  4. 長條圖 (bar chart)
  5. 散佈圖 (scatter plot)
  6. 不使用圓餅圖 (Why I didn't use pie chart)

1. 線圖

  • 從 77 個蛋白質表現隨意挑一些出來畫折線圖
  • 此例我挑前五個 attribute,['DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N']
  • 可知每個 attribute 值域分佈的範圍不同,此 dataset 沒有對他們做標準化,若要做後續分析可能要考慮這件事。
  • y 軸是其數值(表現量)
  • x 軸是第幾筆資料 (原資料有 1080 筆)
In [12]:
print(list(fill_protein.iloc[:,0:5]))
fill_protein.iloc[:,0:5].plot()
['DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N']
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f803a724a20>

2. 盒鬚圖

  • 以 DYRK1A_N 為例
  • 將資料分組畫 boxplot 可看出各 class 的資料離散程度
  • 例如 c-CS-s 就有一些離群值,如同作業三最後提到的。
In [13]:
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f803a77b5f8>

3. 直方圖

  • 此例我挑前 4 個 attribute,['DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N']
  • 和 box plot 一樣可看出資料離散程度,像 DYRK1A_N 和 ITSN1_N 都有一些右邊的離群值
  • histogram 看不出 box plot 能表示的四分位距和平均值
    • 但可看出眾數大概的位置,如 DYRK1A_N 的眾數大概落在 0.25
    • (box plot 看不出來)
  • y 軸是資料筆數
  • x 軸是蛋白質表現程度的數值 (每個區間寬度是 0.25)
In [14]:
fill_protein.iloc[:,0:4].hist(figsize=(8,6))
Out[14]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f80385f7ef0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f803858b6a0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f80385a3908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f80385bdb70>]],
      dtype=object)
  • 只觀察 DYRK1A_N, 將其以 Genotype 分為 Control(正常) 和 Ts65Dn(唐氏症) 分別作圖
  • 可以看到 Control 有較多導致右偏的離群值,應為 c-CS-s 類的那幾個點。
In [15]:
fill_protein.groupby('Genotype').hist(column=['DYRK1A_N'])
Out[15]:
Genotype
Control    [[AxesSubplot(0.125,0.125;0.775x0.755)]]
Ts65Dn     [[AxesSubplot(0.125,0.125;0.775x0.755)]]
dtype: object

4. 散佈圖

  • 將前 17 個 attribute 任兩個取起來畫圖 (pairplot)
  • 可以看出其相關性
  • 對角線則是該 attribute 分組的折線圖(類似 histogram)
In [16]:
sns.pairplot(fill_protein, hue='class')
Out[16]:
<seaborn.axisgrid.PairGrid at 0x7f8038422828>
  • 各 attribute 的 histogram (參考)
In [17]:
fill_protein.plot(kind='hist')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8030716390>

例 4-1

  • 挑出 NR1_N 和 pNR1_N 畫散佈圖,可觀察知這兩種蛋白質的表現程度成高度正相關(對於所有樣本)。
  • 實際上,兩種是類似的蛋白質(皆為 NMDA 受器),因此生物機制上的確有高度相關性。
In [13]:
Relation = pd.concat([fill_protein['NR1_N'], fill_protein['pNR1_N']], axis=1)
Relation = pd.concat([Relation, proteinClass], axis=1)

sns.pairplot(Relation, hue='class')
Out[13]:
<seaborn.axisgrid.PairGrid at 0x1101bef98>

例 4-2

  • 一次畫上 8 種 class 的點有點繁雜
    附上以基因型(Genotype)、藥物注射(Treatment)、學習刺激(Behavior)分別著色的散佈圖。
In [19]:
sns.pairplot(Relation, hue='Genotype')
sns.pairplot(Relation, hue='Treatment', palette="husl")
sns.pairplot(Relation, hue='Behavior', palette="Set2")
Out[19]:
<seaborn.axisgrid.PairGrid at 0x7f80291aeb70>

例 4-3

  • 挑出 DYRK1A_N 和 ITSN1_N 畫散佈圖,除了兩個 attribute 成正相關。
  • 同時可發現綠色的點 (c-CS-s) 對於此兩 attribute 都算是離群值。
In [20]:
Relation2 = pd.concat([fill_protein['DYRK1A_N'], fill_protein['ITSN1_N']], axis=1)
Relation2 = pd.concat([Relation2, proteinClass], axis=1)

sns.pairplot(Relation2, hue='class')
Out[20]:
<seaborn.axisgrid.PairGrid at 0x7f8028f87208>

5. 不使用圓餅圖

1. IQR

  • 以前 17 個蛋白質表現量為例,其實直接下 describe() 就會列出四分位數 (25%, 50%, 75%)
In [21]:
fill_protein.describe()
Out[21]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N
count 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000 1080.000000
mean 0.425565 0.616913 0.319106 2.297134 3.843159 0.233206 0.181856 3.538885 0.212614 1.428060 0.545381 0.313519 0.317961 0.275076 0.825752 0.726782 1.561612
std 0.249058 0.251316 0.049316 0.346819 0.931918 0.041583 0.027005 1.293806 0.032551 0.466403 0.344971 0.051906 0.052165 0.046106 0.117811 0.187773 0.270444
min 0.145327 0.245359 0.115181 1.330831 1.737540 0.063236 0.064043 1.343998 0.112812 0.429032 0.149155 0.052110 0.191431 0.056818 0.500160 0.281285 0.301609
25% 0.288163 0.473669 0.287650 2.059152 3.160287 0.205821 0.164619 2.479861 0.190828 1.204546 0.337486 0.281530 0.281850 0.244294 0.743594 0.591311 1.381308
50% 0.366125 0.565494 0.316703 2.295648 3.738908 0.231246 0.182472 3.329624 0.210681 1.355423 0.443122 0.321266 0.313028 0.277463 0.820850 0.719527 1.563239
75% 0.487574 0.697500 0.348039 2.528035 4.425107 0.257225 0.197226 4.480652 0.234558 1.560931 0.663173 0.348692 0.352272 0.303355 0.898339 0.847276 1.748498
max 2.516367 2.602662 0.497160 3.757641 8.482553 0.539050 0.317066 7.464070 0.306247 6.113347 3.566685 0.493426 0.473992 0.458001 1.408169 1.412750 2.723965
  • 或我們可以只取 25th 百分位數和 75th 百分位數求差,即為四分位距 (IQR)
In [22]:
Q1 = fill_protein.quantile(q=0.25)
Q3 = fill_protein.quantile(q=0.75)
IQR = Q3-Q1

row_name=['Q3', 'Q1', 'IQR']
IQR_DF = pd.DataFrame([Q3, Q1, IQR], row_name)
IQR_DF.assign()
Out[22]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N pERK_N pJNK_N PKCA_N pMEK_N pNR1_N pNR2A_N pNR2B_N
Q3 0.487574 0.697500 0.348039 2.528035 4.425107 0.257225 0.197226 4.480652 0.234558 1.560931 0.663173 0.348692 0.352272 0.303355 0.898339 0.847276 1.748498
Q1 0.288163 0.473669 0.287650 2.059152 3.160287 0.205821 0.164619 2.479861 0.190828 1.204546 0.337486 0.281530 0.281850 0.244294 0.743594 0.591311 1.381308
IQR 0.199411 0.223832 0.060388 0.468882 1.264820 0.051404 0.032608 2.000792 0.043730 0.356385 0.325687 0.067162 0.070422 0.059060 0.154744 0.255965 0.367190

2. Sorting

  • 取前五個蛋白質表現量為例,預設為 descending
  • 為了方便看結果,下 head() 只取前五 row
In [23]:
fill_protein.iloc[:,0:5].sort_values(by=['NR1_N']).head()
Out[23]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N
957 0.216134 0.364058 0.241693 1.330831 1.737540
958 0.212369 0.369960 0.239036 1.346827 1.814940
192 0.145327 0.245359 0.185980 1.403329 2.290973
974 0.226491 0.380497 0.194160 1.414914 1.794716
954 0.238836 0.384145 0.234618 1.440000 1.928436
  • 其實也可以用多變數排序,例如先排 DYRK1A_N,有相等的 DYRK1A_N 再以 ITSN1_N 排序
  • 但因為此資料集為浮點數,幾乎沒什麼相同的值,所以通常用一個變數就能排序完了
In [24]:
fill_protein.iloc[:,0:5].sort_values(by=['DYRK1A_N', 'ITSN1_N']).head()
Out[24]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N
192 0.145327 0.245359 0.185980 1.403329 2.290973
193 0.156849 0.261185 0.197271 1.496977 2.473830
1063 0.163325 0.332454 0.255145 1.901847 2.445910
189 0.164330 0.290535 0.212387 1.670465 2.819603
190 0.168493 0.292269 0.198159 1.634955 2.726874

3. correlation matrix

  • 為方便看結果,挑前十項蛋白質表現量印出 correlation matrix 及做圖 (heatmap)

3-1 correlation matrix

In [57]:
pcorr=fill_protein.iloc[:,0:10].corr()
pcorr.assign()
Out[57]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N
DYRK1A_N 1.000000 0.959513 0.359232 0.294604 0.325260 -0.181222 -0.093845 -0.180563 0.046837 0.791254
ITSN1_N 0.959513 1.000000 0.459727 0.422543 0.426311 -0.147983 -0.076594 -0.133182 0.170720 0.780957
BDNF_N 0.359232 0.459727 1.000000 0.805691 0.749772 0.317602 0.390559 0.246851 0.603838 0.451267
NR1_N 0.294604 0.422543 0.805691 1.000000 0.873872 0.211376 0.244162 0.300849 0.597086 0.416671
NR2A_N 0.325260 0.426311 0.749772 0.873872 1.000000 0.109871 0.111020 0.280193 0.392165 0.409719
pAKT_N -0.181222 -0.147983 0.317602 0.211376 0.109871 1.000000 0.825115 0.457393 0.597263 0.037076
pBRAF_N -0.093845 -0.076594 0.390559 0.244162 0.111020 0.825115 1.000000 0.372253 0.586975 0.116827
pCAMKII_N -0.180563 -0.133182 0.246851 0.300849 0.280193 0.457393 0.372253 1.000000 0.404827 -0.083623
pCREB_N 0.046837 0.170720 0.603838 0.597086 0.392165 0.597263 0.586975 0.404827 1.000000 0.211590
pELK_N 0.791254 0.780957 0.451267 0.416671 0.409719 0.037076 0.116827 -0.083623 0.211590 1.000000

3-2 correlation matrix with heatmap

In [63]:
pcorr.style.background_gradient().set_precision(2)
Out[63]:
DYRK1A_N ITSN1_N BDNF_N NR1_N NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N pELK_N
DYRK1A_N 1 0.96 0.36 0.29 0.33 -0.18 -0.094 -0.18 0.047 0.79
ITSN1_N 0.96 1 0.46 0.42 0.43 -0.15 -0.077 -0.13 0.17 0.78
BDNF_N 0.36 0.46 1 0.81 0.75 0.32 0.39 0.25 0.6 0.45
NR1_N 0.29 0.42 0.81 1 0.87 0.21 0.24 0.3 0.6 0.42
NR2A_N 0.33 0.43 0.75 0.87 1 0.11 0.11 0.28 0.39 0.41
pAKT_N -0.18 -0.15 0.32 0.21 0.11 1 0.83 0.46 0.6 0.037
pBRAF_N -0.094 -0.077 0.39 0.24 0.11 0.83 1 0.37 0.59 0.12
pCAMKII_N -0.18 -0.13 0.25 0.3 0.28 0.46 0.37 1 0.4 -0.084
pCREB_N 0.047 0.17 0.6 0.6 0.39 0.6 0.59 0.4 1 0.21
pELK_N 0.79 0.78 0.45 0.42 0.41 0.037 0.12 -0.084 0.21 1

3-3 heatmap to show correlation

In [67]:
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pcorr, mask=np.zeros_like(pcorr, dtype=np.bool), cmap="Blues",
            square=True, ax=ax)
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f80292e5c50>

4. box plot

  • 以 DYRK1A_N 為例
  • 將資料分組畫 boxplot 可看出各 class 的資料離散程度
  • 例如 c-CS-s 就有一些離群值,如同作業三和作業四提到的。
In [25]:
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='class', ax=ax)
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8028e35a58>
  • 也可以用 Genotype 分別畫 box plot
In [26]:
fig, ax = plt.subplots(figsize=(8, 6))
plt.suptitle('')
fill_protein.boxplot(column=['DYRK1A_N'], by='Genotype', ax=ax)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8028dadac8>

5. histogram with density

  • 以前 4 個蛋白質表現量為例
In [99]:
sns.set(style="white", palette="muted", color_codes=True)
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(9,9))
sns.distplot(fill_protein['DYRK1A_N'], color="b", ax=axes[0, 0])
sns.distplot(fill_protein['ITSN1_N'], color="r", ax=axes[0, 1])
sns.distplot(fill_protein['BDNF_N'], color="g", ax=axes[1, 0])
sns.distplot(fill_protein['NR1_N'], color="m", ax=axes[1, 1])
#plt.setp(axes)
#plt.tight_layout()
plt.show()

6. others discussion

  • 這份資料的數值可解釋性我覺得不算高,不太好理解每欄的數值到底代表什麼意思,但由這些作圖其實可以找出一些分布相似或是相關性高的屬性。
    • 例如 [DYRK1A_N, ITSN1_N] 或是 [NR1_N, pNR1_N]
  • 這個資料集還蠻適合做 clustering 或 classification,接下來我應該會試著用 self-organizing map 來做 clustering
    找出相關性高的 attribute 除了更方便分群合併相鄰 attribute 外,
    也適合在降維(PCA,LDA)的時候把不相關的幾個 attribute 先剔除,降低整體複雜度。

1. 解釋為何使用這些變數當作 x 變量或 y 變量

  • 挑選 X = NR1_N 和 y = pNR1_N,因已知兩種蛋白質為生物上同源且化學結構類似,應有高度相關性,方便迴歸預測
  • 由 Hw5 使用過的 Relation DataFrame 取出這兩行資料。
  • 將 X, y 取 20% 出來做 testing data,其餘 80% 為 training data
In [245]:
Relation.describe(percentiles=[])
Out[245]:
NR1_N pNR1_N
count 1080.000000 1080.000000
mean 2.297134 0.825752
std 0.346819 0.117811
min 1.330831 0.500160
50% 2.295648 0.820850
max 3.757641 1.408169
In [396]:
X, y = Relation.iloc[:,0].values.reshape(-1,1), Relation.iloc[:,1].values.reshape(-1,1)
In [397]:
from sklearn.metrics import mean_squared_error
row = ['linear', 'knn', 'ridge', 'linear^3', 'linear^4+intercept']
col = ['MSE', 'Cor']
regResult= pd.DataFrame(index=row, columns=col)
In [398]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

2. Regression Models

2-1 Simple linear regression

  • $ \hat{y}_{lm} = w_0 + w_1x $
  • 用藍線表 linear model 預測 testing data 的值 ( $\hat{y}_{lm}$ )
  • 用黑點表 testing data 的散佈圖 ( $y$ )
  • 並印出 model 的 coefficient, intercept, correlation
In [423]:
from sklearn.linear_model import LinearRegression

# Train linear model by training set
reg = LinearRegression().fit(X_train, y_train)

# Make predictions using the testing set
y_pred_lm = reg.predict(X_test)

# The coefficients
print('Coefficients: \n', reg.coef_)
print('Intercept: \n', reg.intercept_)
print('linear model Correlation: \n', reg.score(X_train, y_train))


# Plot outputs
plt.scatter(X_test, y_test,  color='black', label='test data')
plt.plot(X_test, y_pred_lm, color='blue', linewidth=3, label='linear model prediction')

regResult.iloc[0, 0] = mean_squared_error(y_test, y_pred_lm)
regResult.iloc[0, 1] = model1.score(X_train, y_train)

plt.legend()
plt.show()
Coefficients: 
 [[0.32068714]]
Intercept: 
 [0.0889937]
linear model Correlation: 
 0.8944103968452173

2-2 k-nearest-neighbor(kNN) Regression with arithmetic progression

  • 用綠線表 kNN model 預測 testing data 的值 ( $\hat{y}_{kNN}$ )
    • k 取 5,其實因為這兩個變數相關性過高,導致 k 取多少其實差不多
    • 有用 uniform 和 distance 兩種方式來計算 neighbor,但結果其實差不多。
  • 和 linear model 做比較
    • 用藍線表 linear model 預測 testing data 的值 ( $\hat{y}_{lm}$ )
    • 用黑點表 testing data 的散佈圖 ( $y$ )a
  • 並印出 kNN model 的 correlation

  • 可以看到右上角有一個比較遠的點有點像離群值,當 test data 挑到那個點的時候會導致 knn model 有點歪掉

In [400]:
from sklearn import neighbors
# Fit regression model
k = 5
T = np.linspace(X_test.min(), X_test.max(), 108)[:, np.newaxis]
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(k, weights=weights)
    y_pred_knn = knn.fit(X_train, y_train).predict(T)

    
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test, y_test, c='k', label='test data')
    plt.plot(T, y_pred_knn, c='g', label='knn prediction', linewidth=4)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=4, label='linear model prediction')
    #plt.axis('tight')
    plt.ylim(y_test.min()-y_test.min()/5, y_test.max()+y_test.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))
# The coefficients
print('kNN Correlation: \n', knn.score(X_train, y_train))
plt.tight_layout()
plt.show()
kNN Correlation: 
 1.0

2-3 k-nearest-neighbor(kNN) Regression with testing data

  • 用綠線表 kNN model 預測 testing data 的值 ( $\hat{y}_{kNN}$ )
    • k 取 5,其實因為這兩個變數相關性過高,導致 k 取多少其實差不多
    • 有用 uniform 和 distance 兩種方式來計算 neighbor,但結果其實差不多。
  • 和 linear model 做比較
    • 用藍線表 linear model 預測 testing data 的值 ( $\hat{y}_{lm}$ )
    • 用黑點表 testing data 的散佈圖 ( $y$ )a
  • 並印出 kNN model 的 correlation
  • testing data 若沒有排序的話,kNN 的折線圖容易橫向的去連接下一個點,看起來不像迴歸線
In [401]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2)
In [402]:
from sklearn import neighbors
# Fit regression model
k = 5
T = np.linspace(X_test2.min(), X_test2.max(), 216)[:, np.newaxis]
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(k, weights=weights)
    y_pred_knn = knn.fit(X_train2, y_train2).predict(X_test2)

    
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test2, y_test2, c='k', label='data')
    plt.plot(X_test2, y_pred_knn, c='g', label='prediction', linewidth=3)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=3)
    #plt.axis('tight')
    plt.ylim(y_test2.min()-y_test2.min()/5, y_test2.max()+y_test2.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))
# The coefficients
print('Correlation: \n', knn.score(X_train, y_train))



plt.tight_layout()
plt.show()
Correlation: 
 0.971731783087436
  • 將 X_test 排序完之後再放入 model 去 predict,畫出來就比較好看了
In [410]:
from sklearn import neighbors
# Fit regression model
k = 5
T = np.linspace(X_test2.min(), X_test2.max(), 108)[:, np.newaxis]
plt.figure(figsize=(12,12))
for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(k, weights=weights)
    y_pred_knn = knn.fit(X_train2, y_train2).predict(np.sort(X_test2.reshape(1,-1)).reshape(-1,1))

    
    plt.subplot(2, 1, i + 1)
    plt.scatter(X_test2, y_test2, c='k', label='data')
    plt.plot(np.sort(X_test2.reshape(1,-1)).reshape(-1,1), y_pred_knn, c='g', label='prediction', linewidth=3)
    plt.plot(X_test, y_pred_lm, color='blue', linewidth=3)
    #plt.axis('tight')
    plt.ylim(y_test2.min()-y_test2.min()/5, y_test2.max()+y_test2.max()/5)
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (k,
                                                                weights))

regResult.iloc[1, 0] = mean_squared_error(y_test2, y_pred_knn)
regResult.iloc[1, 1] = model1.score(X_train2, y_train2)

# The coefficients
print('Correlation: \n', knn.score(X_train, y_train))
plt.tight_layout()
plt.show()
Correlation: 
 0.971731783087436

2-4 Polynomial Ridge Regression

  • Ridge Regression
    • 用預設的 Ridge regression 做 model,alpha 值預設為 1
  • 加入平方項和三次方項建 model
    • 紅線表示 $\hat{y}_{1} = w_1X$
    • 橘線表示 $\hat{y}_{2} = w_1X+w_2X^2$
    • 黃線表示 $\hat{y}_{3} = w_1X+w_2X^2+w_3X^3$
  • 因變相相關性過大,其實結果跟 linear model 看起來很像,不太需要用 ridge regression
  • 隨著使用的變數變多,Correlation 有隨著提升
In [412]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

colors = ['red', 'orange', 'teal', 'yellowgreen', 'gold']
lw = 2
#plt.plot(X_train, y_train, color='cornflowerblue', linewidth=lw,
#         label="ground truth")
plt.scatter(X_test2, y_test2, color='navy', s=30, marker='o', label="training points")

for count, degree in enumerate([1, 2, 3]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(X_train2, y_train2)
    y_polyRidge = model.predict(np.sort(X_test2.reshape(1,-1)).reshape(-1,1))
    print('Correlation%d: \n' % degree, model.score(X_train2, y_train2))
    plt.plot(np.sort(X_test2.reshape(1,-1)).reshape(-1,1), y_polyRidge, color=colors[count], linewidth=lw,
             label="degree %d" % (degree))

    
regResult.iloc[2, 0] = mean_squared_error(y_test2, y_polyRidge)
regResult.iloc[2, 1] = model1.score(X_train2, y_train2)
    
    
plt.legend()
plt.show()
Correlation1: 
 0.8980479864416075
Correlation2: 
 0.8993041406589958
Correlation3: 
 0.8993910962218653

2-5 Polynomial Linear Regression

  • 用深綠粗線表 model1 ( $\hat{y}_{m1} = w_1X + w_2X^2 + w_3X^3$ )
  • 用淺綠細線表 model2 ( $\hat{y}_{m2} = w_0+ w_1X + w_2X^2 + w_3X^3 + w_4X^4$ )

  • 雖然可以決定 model 要不要考慮 intercept 項,但因這兩個變數相關性太高,導致不管有沒有考慮 train 出來的 model 都長差不多。

    • 因此 model2 除了增加 intercept 項,再多增加一個 4 次方項以和 model1 比較。
    • 其實會發現 Correlation 並沒差多少,model 和 data 已經相當 fit 了。
In [421]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
model1 = Pipeline([('poly', PolynomialFeatures(degree=3)),
                  ('linear', LinearRegression(fit_intercept=False))])
# fit to an order-3 polynomial data
model1 = model1.fit(X_train2, y_train2)
model1.named_steps['linear'].coef_
model1.named_steps['linear'].intercept_

# The coefficients
print('Coefficients: \n', model1.named_steps['linear'].coef_)
print('Intercept: \n', model1.named_steps['linear'].intercept_)
print('Correlation: \n', model1.score(X_train2, y_train2))

y1 = model1.predict(np.sort(X_test2.reshape(1,-1)).reshape(-1,1))

regResult.iloc[3, 0] = mean_squared_error(y_test2, y1)
regResult.iloc[3, 1] = model1.score(X_train2, y_train2)
Coefficients: 
 [[-0.20521615  0.78254396 -0.22993012  0.03666342]]
Intercept: 
 0.0
Correlation: 
 0.9012322379889552
In [417]:
model2 = Pipeline([('poly', PolynomialFeatures(degree=4)),
                  ('linear', LinearRegression(fit_intercept=True))])
# fit to an order-3 polynomial data
model2 = model2.fit(X_train2, y_train2)
model2.named_steps['linear'].coef_
model2.named_steps['linear'].intercept_

# The coefficients
print('Coefficients: \n', model2.named_steps['linear'].coef_)
print('Intercept: \n', model2.named_steps['linear'].intercept_)
print('Correlation: \n', model2.score(X_train2, y_train2))

y2 = model2.predict(np.sort(X_test2.reshape(1,-1)).reshape(-1,1))

regResult.iloc[4, 0] = mean_squared_error(y_test2, y2)
regResult.iloc[4, 1] = model2.score(X_train2, y_train2)
Coefficients: 
 [[ 0.          2.05707897 -1.05057645  0.26560611 -0.02336814]]
Intercept: 
 [-0.92814585]
Correlation: 
 0.9015125541246998
In [419]:
plt.figure(figsize=(8, 6))
plt.scatter(X_test2, y_test2, color='navy', s=30, marker='o', label="training points")
plt.plot(np.sort(X_test2.reshape(1,-1)).reshape(-1,1), y1, c='teal', label='fit_intercept=True', linewidth=5)
plt.plot(np.sort(X_test2.reshape(1,-1)).reshape(-1,1), y2, c='yellowgreen', label='fit_intercept=False')
plt.legend()
plt.show()

3. Model Evaluation

  • 使用基本的 Mean Square Error
    • 結果最基本的 linear model 就已經是 error 最小的了...
In [426]:
regResult.assign()
Out[426]:
MSE Cor
linear 0.00144936 0.896963
knn 0.0192147 0.901232
ridge 0.0194223 0.901232
linear^3 0.0192919 0.901232
linear^4+intercept 0.0192743 0.901513

4. Correlation between X & y

  • 可以看到這次挑的變數相關性過大,導致各種 model 結果其實差不多。
  • 這份 dataset 好像沒有比較適合迴歸的資料,也許有機會再用其他筆資料試試看。
In [424]:
regResult.assign()
Out[424]:
MSE Cor
linear 0.00144936 0.896963
knn 0.0192147 0.901232
ridge 0.0194223 0.901232
linear^3 0.0192919 0.901232
linear^4+intercept 0.0192743 0.901513